#REPLICATION OF BLOOM, SCHANKERMAN, & VAN REENEN 2013
#Gov 2001
#Replication authors: Carlos Carpi, Ian Lundberg, and Rohan Thavarajah
#Authors listed alphabetically

#install.packages("foreign")
library(foreign)
#YOU NEED TO CHANGE THIS LINE TO THE FILE LOCATION ON YOUR COMPUTER
setwd("F:\\Bloom Spillovers 2013\\Bloom Spillovers 2013 DATA\\")
statadata <- read.dta("spillover_final2.dta")

#NOTES:
#Key methods: Use lm() to estimate the model with the coefficients.
#Apply NeweyWest() to the lm() object to get Newey-West corrected standard errors,
#which account for serial autocorrelation of order 1 and arbitrary heteroskedasticity.
#NeweyWest has no impact on the coefficients, only the standard errors.
#For IV regression, use ivreg (from AER package) and the adjust standard errors with NeweyWest.

#The line below is just an adjustment so we get decimal output rather than exponential notation
	options("scipen" = 10)

#############################################
#Get intraclass correlation for Jaffe measure
#############################################

data.jaffe <- read.dta("spilltec_corr_long.dta")
data.jaffe$combo <- interaction(factor(data.jaffe$cusip), factor(data.jaffe$cusip2), drop = TRUE)
subset.data.jaffe <- subset(data.jaffe, year>=1972& year<=1997)
#Get intraclass correlation coefficient for Jaffe measure
#install.packages("ICC")
library(ICC)
icc.jaffe <- ICCbareF(combo, tech, data = subset.data.jaffe)
icc.jaffe

###################################################
#Get intraclass correlation for Mahalanobis measure
###################################################

data.mah <- read.dta("spilltec_mal_corr_long.dta")
data.mah$combo <- interaction(factor(data.mah$cusip), factor(data.mah$cusip2), drop = TRUE)
subset.data.mah <- subset(data.mah, year>=1972& year<=1997)
#Get intraclass correlation coefficient for Mahalanobis measure
#ICCbareF estimates a one-way ANOVA with random effects, but it does so
#using algorithms to increase speed for a large data set
icc.mah <- ICCbareF(combo, tech, data = subset.data.mah)
icc.mah



#Names of some key variables
#tobinq: Tobin's Q
#rmkvaf: Market value
#grd: R&D stock
#grd_k1: R&D stock/fixed capital
#rxrd: R&D flow
#gspilltec: SPILLTECH
#gspillsic: SPILLSIC
#pat_count: Patent flow
#pat_cite: Cite-weighted patents
#rsales: Sales
#rppent: Fixed capital
#emp: Employment (in thousands)
#lq: ln(Tobin's Q)
#lgspilltec_roll1: ln(SPILLTECH)_(t-1), Jaffe distance measure
#lgspillsic1: ln(SPILLSIC)_(t-1), Jaffe distance measure
#lgspillmaltec_roll1, Mahalanobis distance measure
#lgspillmalsic1, Mahalanobis distance measure
#grd_k1 + grd_kt2 + grd_kt3 + grd_kt4 + grd_kt5: (R&D Stock / Capital Stock)^n_(t-1)
	#The above is included as a 6th order polynomial, only first term shown
#grd_k1_dum: dummy variable for observations where lagged R&D stock is 0
#lsales_ind: Current industry sales in each firm's output industry
#lsales_ind1: Lagged sales in each firm's output industry
#factor(year): Full set of year dummies
#factor(num): Full set of firm dummies

#Instruments
#lgspilltecIV1: ln(TECHTAX)
#lgspillsicIV1: ln(SICTAX)

#lfirm: Federal tax credit value
#lstate: State tax credit value
#firm_dum: 



#Table 1
	#Make the data matrix for the table
	data_table1 <- statadata[which(statadata$year>1984 & statadata$year<2001), ]
	data_table1_nm_timeinvar <- na.omit(data_table1[c("lq", "lgspilltec1", "lgspillsic1", "grd_k1", "grd_k1_dum", "grd_kt2", "grd_kt3", "grd_kt4", "grd_kt5", "grd_kt6", "lsales_ind", "lsales_ind1", "year", "num", "lgspillmaltec1", "lgspillmalsic1", "lgspilltecIV1", "lgspillsicIV1", "lfirm", "lstate", "firm_dum")])
	data_table3_nm <-           na.omit(data_table3[c("lq", "lgspilltec1", "lgspillsic1", "grd_k1", "grd_k1_dum", "grd_kt2", "grd_kt3", "grd_kt4", "grd_kt5", "grd_kt6", "lsales_ind", "lsales_ind1", "year", "num", "lgspillmaltec1", "lgspillmalsic1", "lgspilltecIV1", "lgspillsicIV1", "lfirm", "lstate", "firm_dum")])
	data_table1_nm_timevar <- na.omit(data_table1[c("lq", "lgspilltec_roll1", "lgspillsic1", "grd_k1", "grd_k1_dum", "grd_kt2", "grd_kt3", "grd_kt4", "grd_kt5", "grd_kt6", "lsales_ind", "lsales_ind1", "year", "num", "lgspillmaltec_roll1", "lgspillmalsic1", "lgspilltecIV1", "lgspillsicIV1", "lfirm", "lstate", "firm_dum")])
	#Check that we have 9,944 observations for time-invariant
	nrow(data_table1_nm_timeinvar)
	#Check that we have 6,209 observations for time-variant
	nrow(data_table1_nm_timevar)
	#Reported coefficients in the paper are those on lgspilltec_roll1, lgspillsic1, and grd_k1
	#Table 1, Column 1: OLS Jaffe, no firm fixed effects, time-invariant tech distance
		#Use lm to get coefficients
		t1c1 <- lm(lq~lgspilltec1 + lgspillsic1 + grd_k1 + grd_k1_dum + grd_kt2 + grd_kt3 + grd_kt4 + grd_kt5 + grd_kt6 + lsales_ind + lsales_ind1 + factor(year), data = data_table1_nm_timeinvar)
		#Use NeweyWest() to adjust standard errors for autocorrelation (maximum lag 1) and arbitrary heteroskedasticity
		#NeweyWest() outputs the variance-covariance matrix of the coefficients, so we want the square root of the diagonal elements
		#See documentation with help(NeweyWest) or at http://127.0.0.1:26613/library/sandwich/html/NeweyWest.html
		library(sandwich)
		t1c1_se <- sqrt(diag(NeweyWest(t1c1, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		#Make a table reporting the coefficients and Newey-West standard errors
		t1c1_results <- cbind(t1c1$coef,t1c1_se)
		#I'm just going to output the head, since it has all the coefficients we care about
		head(t1c1_results)
	#Table 1, Column 2: OLS Jaffe, with firm fixed effects, time-invariant tech distance
		t1c2 <- lm(lq~lgspilltec1 + lgspillsic1 + grd_k1 + grd_k1_dum + grd_kt2 + grd_kt3 + grd_kt4 + grd_kt5 + grd_kt6 + lsales_ind + lsales_ind1 + factor(year) + factor(num), data = data_table1_nm_timeinvar)
		t1c2_se <- sqrt(diag(NeweyWest(t1c2, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		t1c2_results <- cbind(t1c2$coef,t1c2_se)
		head(t1c2_results)
	#Table 1, Column 3: OLS Jaffe, with firm fixed effects, time-varying tech distance
		t1c3 <- lm(lq~lgspilltec_roll1 + lgspillsic1 + grd_k1 + grd_k1_dum + grd_kt2 + grd_kt3 + grd_kt4 + grd_kt5 + grd_kt6 + lsales_ind + lsales_ind1 + factor(year) + factor(num), data = data_table1_nm_timevar)
		t1c3_se <- sqrt(diag(NeweyWest(t1c3, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		t1c3_results <- cbind(t1c3$coef,t1c3_se)
		head(t1c3_results)



################
#APPENDIX TABLES
################

#NOTE: Since the appendix involves a lot of different models, the table
#numbers here refer to the tables in which the results were originally
#presented in Bloom et al. (2013) in order to avoid confusion between models.

###########################################
#APPENDIX TABLE A1. JAFFE DISTANCE MEASURE#
###########################################
	#Log Tobin's Q models are same as presented in Table 1 of main text
	
#CITE-WEIGHTED PATENTS MODELS
	#Make the data matrix for the cite-weighted patents
	data_table4 <- statadata[which(statadata$year<1999 & statadata$year>(statadata$fyear+statadata$prior_years)), ]
	#All models include time dummies and industry dummies
	#Merge software together as otherwise does not produce standard errors as too many industry codes
		data_table4$sic[data_table4$sic==6211] <- 999
		data_table4$sic[data_table4$sic==7370] <- 7373
	data_table4_nm_timeinvar <- na.omit(data_table4[c("pat_cite", "lpat_cite1", "lpat_cite1_dum", "lgrd1", "lgrd1_dum", "lgspilltec1", "lgspillsic1", "lsales1", "lpriorpat_cite", "lpriorpat_cite_dum", "num", "year", "lgspillmaltec1", "lgspillmalsic1", "lgspilltecIV1", "lgspillsicIV1", "year", "sic", "lghxrd1")])
	data_table4_nm_timevar <- na.omit(data_table4[c("pat_cite", "lpat_cite1", "lpat_cite1_dum", "lgrd1", "lgrd1_dum", "lgspilltec_roll1", "lgspillsic1", "lsales1", "lpriorpat_cite", "lpriorpat_cite_dum", "num", "year", "lgspillmaltec_roll1", "lgspillmalsic1", "lgspilltecIV1", "lgspillsicIV1", "year", "sic", "lghxrd1")])
	#Check that we have 9,023 observations for time-invariant
	nrow(data_table4_nm_timeinvar)
	#Check that we have 5,544 observations for time-varying tech distance
	nrow(data_table4_nm_timevar)
	#In this table, standard errors are clustered rather than corrected with Newey-White
	#install.packages("MASS")
	library(MASS)
	#Standard error issue for negative binomial (paper clusters SEs)
		#Some code to estimate cluster-robust standard errors.
		#Taken from http://projects.iq.harvard.edu/files/gov2001/files/section7_2014_print.pdf, slides 77-78
		cl.error <- function (model, clustvar) {
			fc <- clustvar
			m <- length(unique(fc))
			k <- length(coef(model))
			u <- estfun(model)
			u.clust <- matrix(NA, nrow=m, ncol=k)
			for(j in 1:k){
				u.clust[,j] <- tapply(u[,j], fc, sum)
			}
			return(sqrt(diag(vcov(model) %*% ((m / (m-1)) * t(u.clust) %*% (u.clust)) %*% vcov(model))))
		}
	#Table 4, Column 1: NBin Jaffe, time-invariant tech distance
		t4c1 <- glm.nb(pat_cite ~ lgspilltec1 + lgspillsic1 + lgrd1 + lgrd1_dum + lsales1 + factor(sic) + factor(year), data = data_table4_nm_timeinvar)
		#Clustered standard errors
		t4c1_se_cl <- cl.error(model = t4c1, clustvar = data_table4_nm_timeinvar$num)
		head(cbind(t4c1$coef, t4c1_se_cl))
	#Table 4, Column 2: NBin Jaffe, time-invariant tech distance, with fixed-effects via "pre-sample mean scaling approach" (Blundell, Griffith, & Van Reenen 1999), lpriorpat_cite
		t4c2 <- glm.nb(pat_cite ~ lgspilltec1 + lgspillsic1 + lgrd1 + lpriorpat_cite + lpriorpat_cite_dum + lgrd1_dum + lsales1 + factor(sic) + factor(year), data = data_table4_nm_timeinvar)
		t4c2_se_cl <- cl.error(model = t4c2, clustvar = data_table4_nm_timeinvar$num)
		head(cbind(t4c2$coef, t4c2_se_cl))
	#Table 4, Column 2: NBin Jaffe, time-varying tech distance, with fixed-effects via "pre-sample mean scaling approach" (Blundell, Griffith, & Van Reenen 1999), lpriorpat_cite
		t4c2 <- glm.nb(pat_cite ~ lgspilltec_roll1 + lgspillsic1 + lgrd1 + lpriorpat_cite + lpriorpat_cite_dum + lgrd1_dum + lsales1 + factor(sic) + factor(year), data = data_table4_nm_timevar)
		t4c2_se_cl <- cl.error(model = t4c2, clustvar = data_table4_nm_timevar$num)
		head(cbind(t4c2$coef, t4c2_se_cl))

#LOG OF SALES MODELS
	#Make the data matrix for the table
	data_table5 <- statadata[which(statadata$year>1984 & statadata$year<2001), ]
	data_table5_nm_timeinvar <- na.omit(data_table5[c("lsales", "lemp1", "lppent1", "lgrd1", "lgrd1_dum", "lgspilltec1", "lgspillsic1", "lgspillmaltec1", "lgspillmalsic1", "lgspilltecIV1", "lgspillsicIV1", "lsales_ind", "lsales_ind1", "lpind_ind", "year", "num")])
	data_table5_nm_timevar <- na.omit(data_table5[c("lsales", "lemp1", "lppent1", "lgrd1", "lgrd1_dum", "lgspilltec_roll1", "lgspillsic1", "lgspillmaltec_roll1", "lgspillmalsic1", "lgspilltecIV1", "lgspillsicIV1", "lsales_ind", "lsales_ind1", "lpind_ind", "year", "num")])
	#Check that we have 9,935 observations in time-invariant matrix
	nrow(data_table5_nm_timeinvar)
	#Check that we have 6,226 observations in time-varying matrix
	nrow(data_table5_nm_timevar)
	#Table 5, Column 1: OLS Jaffe
		t5c1 <- lm(lsales ~ lgspilltec1 + lgspillsic1 + lppent1 + lemp1 + lgrd1 + lgrd1_dum + lsales_ind + lsales_ind1 + lpind_ind + factor(year), data = data_table5_nm_timeinvar)
		t5c1_se <- sqrt(diag(NeweyWest(t5c1, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		t5c1_results <- cbind(t5c1$coef,t5c1_se)
		head(t5c1_results)
	#Table 5, Column 2: OLS Jaffe, with firm fixed effects
		t5c2 <- lm(lsales ~ lgspilltec1 + lgspillsic1 + lppent1 + lemp1 + lgrd1 + lgrd1_dum + lsales_ind + lsales_ind1 + lpind_ind + factor(year) + factor(num), data = data_table5_nm_timeinvar)
		t5c2_se <- sqrt(diag(NeweyWest(t5c2, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		t5c2_results <- cbind(t5c2$coef,t5c2_se)
		head(t5c2_results)
	#Table 5, Column 2: OLS Jaffe, with firm fixed effects, time-varying tech distance
		t5c2 <- lm(lsales ~ lgspilltec_roll1 + lgspillsic1 + lppent1 + lemp1 + lgrd1 + lgrd1_dum + lsales_ind + lsales_ind1 + lpind_ind + factor(year) + factor(num), data = data_table5_nm_timevar)
		t5c2_se <- sqrt(diag(NeweyWest(t5c2, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		t5c2_results <- cbind(t5c2$coef,t5c2_se)
		head(t5c2_results)

#LOG(R&D/SALES) MODELS
	#Make the data matrix for the table
	#I have to make two of them, since there are a few NA observations for the ln(R&D/Sales)_(t-1) variable (lxrd_sales1)
	data_table6_nm_timeinvar <- na.omit(statadata[c("lxrd_sales", "lgspilltec1", "lgspillsic1", "lgspillmaltec1", "lgspillmalsic1", "lgspilltecIV1", "lgspillsicIV1", "lsales_ind", "lsales_ind1", "lfirm", "firm_dum", "lstate", "year", "num")])
	data_table6_nm_timevar <- na.omit(statadata[c("lxrd_sales", "lgspilltec_roll1", "lgspillsic1", "lgspillmaltec_roll1", "lgspillmalsic1", "lgspilltecIV1", "lgspillsicIV1", "lsales_ind", "lsales_ind1", "lfirm", "firm_dum", "lstate", "year", "num")])
	#Check that we have 8,579 observations for the time-invariant
	nrow(data_table6_nm_timeinvar)
	#Check that we have 6,509 observations for the time-varying
	nrow(data_table6_nm_timevar)
	#Table 6, Column 1: OLS Jaffe
		t6c1 <- lm(lxrd_sales ~ lgspilltec1 + lgspillsic1 + lsales_ind + lsales_ind1 + factor(year), data = data_table6_nm_timeinvar)
		t6c1_se <- sqrt(diag(NeweyWest(t6c1, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		t6c1_results <- cbind(t6c1$coef,t6c1_se)
		head(t6c1_results)
	#Table 6, Column 2: OLS Jaffe, with firm fixed effects
		t6c2 <- lm(lxrd_sales ~ lgspilltec1 + lgspillsic1 + lsales_ind + lsales_ind1 + factor(year) + factor(num), data = data_table6_nm_timeinvar)
		t6c2_se <- sqrt(diag(NeweyWest(t6c2, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		t6c2_results <- cbind(t6c2$coef,t6c2_se)
		head(t6c2_results)
	#Table 6, Column 2: OLS Jaffe, with firm fixed effects, time-varying tech distance
		t6c2 <- lm(lxrd_sales ~ lgspilltec_roll1 + lgspillsic1 + lsales_ind + lsales_ind1 + factor(year) + factor(num), data = data_table6_nm_timevar)
		t6c2_se <- sqrt(diag(NeweyWest(t6c2, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		t6c2_results <- cbind(t6c2$coef,t6c2_se)
		head(t6c2_results)

################################################
#APPENDIX TABLE A2. MAHALNOBIS DISTANCE MEASURE#
################################################

#LOG(TOBIN'S Q)
	#Table 1, Column 2: OLS Jaffe, with firm fixed effects, time-invariant tech distance
		t1c2 <- lm(lq~lgspillmaltec1 + lgspillmalsic1 + grd_k1 + grd_k1_dum + grd_kt2 + grd_kt3 + grd_kt4 + grd_kt5 + grd_kt6 + lsales_ind + lsales_ind1 + factor(year) + factor(num), data = data_table1_nm_timeinvar)
		t1c2_se <- sqrt(diag(NeweyWest(t1c2, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		t1c2_results <- cbind(t1c2$coef,t1c2_se)
		head(t1c2_results)
	#Table 1, Column 3: OLS Jaffe, with firm fixed effects, time-varying tech distance
		t1c3 <- lm(lq~lgspillmaltec_roll1 + lgspillmalsic1 + grd_k1 + grd_k1_dum + grd_kt2 + grd_kt3 + grd_kt4 + grd_kt5 + grd_kt6 + lsales_ind + lsales_ind1 + factor(year) + factor(num), data = data_table1_nm_timevar)
		t1c3_se <- sqrt(diag(NeweyWest(t1c3, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		t1c3_results <- cbind(t1c3$coef,t1c3_se)
		head(t1c3_results)
	
#CITE-WEIGHTED PATENTS MODELS
	#Table 4, Column 2: NBin Jaffe, time-invariant tech distance, with fixed-effects via "pre-sample mean scaling approach" (Blundell, Griffith, & Van Reenen 1999), lpriorpat_cite
		t4c2 <- glm.nb(pat_cite ~ lgspillmaltec1 + lgspillmalsic1 + lgrd1 + lpriorpat_cite + lpriorpat_cite_dum + lgrd1_dum + lsales1 + factor(sic) + factor(year), data = data_table4_nm_timeinvar)
		t4c2_se_cl <- cl.error(model = t4c2, clustvar = data_table4_nm_timeinvar$num)
		head(cbind(t4c2$coef, t4c2_se_cl))
	#Table 4, Column 2: NBin Jaffe, time-varying tech distance, with fixed-effects via "pre-sample mean scaling approach" (Blundell, Griffith, & Van Reenen 1999), lpriorpat_cite
		t4c2 <- glm.nb(pat_cite ~ lgspillmaltec_roll1 + lgspillmalsic1 + lgrd1 + lpriorpat_cite + lpriorpat_cite_dum + lgrd1_dum + lsales1 + factor(sic) + factor(year), data = data_table4_nm_timevar)
		t4c2_se_cl <- cl.error(model = t4c2, clustvar = data_table4_nm_timevar$num)
		head(cbind(t4c2$coef, t4c2_se_cl))

#LOG OF SALES MODELS
	#Table 5, Column 2: OLS Jaffe, with firm fixed effects
		t5c2 <- lm(lsales ~ lgspillmaltec1 + lgspillmalsic1 + lppent1 + lemp1 + lgrd1 + lgrd1_dum + lsales_ind + lsales_ind1 + lpind_ind + factor(year) + factor(num), data = data_table5_nm_timeinvar)
		t5c2_se <- sqrt(diag(NeweyWest(t5c2, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		t5c2_results <- cbind(t5c2$coef,t5c2_se)
		head(t5c2_results)
	#Table 5, Column 2: OLS Jaffe, with firm fixed effects, time-varying tech distance
		t5c2 <- lm(lsales ~ lgspillmaltec_roll1 + lgspillmalsic1 + lppent1 + lemp1 + lgrd1 + lgrd1_dum + lsales_ind + lsales_ind1 + lpind_ind + factor(year) + factor(num), data = data_table5_nm_timevar)
		t5c2_se <- sqrt(diag(NeweyWest(t5c2, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		t5c2_results <- cbind(t5c2$coef,t5c2_se)
		head(t5c2_results)

#LOG(R&D/SALES) MODELS
	#Table 6, Column 2: OLS Jaffe, with firm fixed effects
		t6c2 <- lm(lxrd_sales ~ lgspillmaltec1 + lgspillmalsic1 + lsales_ind + lsales_ind1 + factor(year) + factor(num), data = data_table6_nm_timeinvar)
		t6c2_se <- sqrt(diag(NeweyWest(t6c2, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		t6c2_results <- cbind(t6c2$coef,t6c2_se)
		head(t6c2_results)
	#Table 6, Column 2: OLS Jaffe, with firm fixed effects, time-varying tech distance
		t6c2 <- lm(lxrd_sales ~ lgspillmaltec_roll1 + lgspillmalsic1 + lsales_ind + lsales_ind1 + factor(year) + factor(num), data = data_table6_nm_timevar)
		t6c2_se <- sqrt(diag(NeweyWest(t6c2, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
		t6c2_results <- cbind(t6c2$coef,t6c2_se)
		head(t6c2_results)


